### Major League Baseball Data
# Load the 2014 team statistics and keep columns 2-6 as the analysis
# variables (column 1 is dropped; presumably it holds the team name --
# confirm against the CSV).
MLB <- read.csv("Major League Baseball Main Stats Altered 2014.csv")
MLB
MLB <- MLB[, 2:6]
MLB
# NOTE(review): nothing visible in this section calls an MVA function; the
# package load is kept in case other parts of the script rely on it.
library("MVA")

# ---- PCA on the covariance matrix ----
cov.MLB <- cov(MLB)
cov.MLB
MLB_PCA <- eigen(cov.MLB)
# Label the eigenvalues and the rows and columns of the eigenvector matrix
# with the variable names.
# NOTE(review): eigen() returns one eigenvalue and one eigenvector *column*
# per principal component, so only the row names below are genuine variable
# labels; the value names and column names index components, not variables.
# The labelling is left unchanged because it is visible in printed output.
names(MLB_PCA$values) <- names(MLB)
colnames(MLB_PCA$vectors) <- names(MLB)
rownames(MLB_PCA$vectors) <- names(MLB)

# Scree plot: component variances (eigenvalues) and their share of the total.
MLB_PC.variance <- MLB_PCA$values
(MLB_PC.prop <- MLB_PC.variance / sum(MLB_PC.variance))
plot(
  seq_along(MLB_PC.variance), MLB_PC.variance,
  main = "Scree Plot for Major League Baseball - all variables, covariance matrix",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance",
  type = "b"
)
# The scree plot shows that PCA on the covariance matrix detects one
# principal component.
# The Mean Salary variable has the biggest proportion of the common variation.
# The rest of the PCs are beyond the elbow.

# ---- PCA on the correlation matrix ----
cor.MLB <- cor(MLB)
cor.MLB
MLB_PCA1 <- eigen(cor.MLB)
# Same labelling as for the covariance PCA (see the review note there: the
# value names and column names actually index components).
names(MLB_PCA1$values) <- names(MLB)
colnames(MLB_PCA1$vectors) <- names(MLB)
rownames(MLB_PCA1$vectors) <- names(MLB)

# Scree plot for the correlation-based components.
MLB_PC1.variance <- MLB_PCA1$values
(MLB_PC1.prop <- MLB_PC1.variance / sum(MLB_PC1.variance))
plot(
  seq_along(MLB_PC1.variance), MLB_PC1.variance,
  main = "Scree Plot for Major League Baseball - all variables, correlation matrix",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance",
  type = "b"
)
# The scree plot shows that PCA on the correlation matrix detects one
# principal component.
# The Mean Salary variable has the biggest proportion of the common variation
# again, but there are two more PCs that are not beyond the elbow.
# ---- Re-orient earned run average and errors ----
# Subtracting each value from its column maximum reverses the ordering of
# these two variables (presumably so that larger values mean better
# performance, like the remaining variables -- confirm).
MLB$Earned.Run.Avg <- max(MLB$Earned.Run.Avg) - MLB$Earned.Run.Avg
MLB$Errors <- max(MLB$Errors) - MLB$Errors

# ---- PCA on the correlation matrix of the re-oriented data ----
cor.MLB <- cor(MLB)
cor.MLB
# Decompose once and reuse the result instead of calling eigen() repeatedly.
MLB_cor.eigen <- eigen(cor.MLB)
round(MLB_cor.eigen$values, 3)
round(MLB_cor.eigen$vectors, 3)
MLB_PC1.variance <- MLB_cor.eigen$values
plot(
  seq_along(MLB_PC1.variance), MLB_PC1.variance,
  main = "Scree Plot for Major League Baseball - all variables, correlation matrix",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance",
  type = "b"
)
# Eigen-decomposition of the correlation sub-matrix for variables 2-5, i.e.
# with the first variable left out.
eigen(cor.MLB[2:5, 2:5])

# Component variances, loadings and proportion of variance explained.
PC.variance <- MLB_cor.eigen$values
PC.variables <- MLB_cor.eigen$vectors
PC.var.prop <- PC.variance / sum(PC.variance)
PC.var.prop
# M is a lower-triangular matrix of ones, so M %*% PC.var.prop is the
# cumulative proportion of variance explained by the first k components.
# It is sized from the data rather than hard-coded as 5 x 5.
M <- 1 * lower.tri(diag(length(PC.var.prop)), diag = TRUE)
M
M %*% PC.var.prop
plot(
  seq_along(PC.variance), PC.variance,
  main = "Scree Plot for Baseball Data",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance",
  type = "b"
)

### Make the highly paid New York Yankees have a miserable year with
### one of the lowest winning percentages and batting averages.
### Then redo the above analyses.

### First, read in the data again, and adjust it as before.
MLB <- read.csv("Major League Baseball Main Stats Altered 2014.csv")
MLB
MLB <- MLB[, 2:6]
MLB
library("MVA")

### Then change the statistics for the New York Yankees in Row 19.
# NOTE(review): the row and column positions are hard-coded; verify that row
# 19 is still the Yankees and that columns 2 and 3 of the subset are winning
# percentage and batting average if the CSV layout changes.
MLB[19, ]
MLB[19, 2] <- 0.377 ### The winning percentage for the Chicago Cubs
MLB[19, 3] <- 0.245 ### The batting average for the Toronto Blue Jays
MLB[19, ]
### Then proceed as before.
# ---- Re-orient earned run average and errors (altered data) ----
# Subtracting each value from its column maximum reverses the ordering of
# these two variables (presumably so that larger values mean better
# performance, like the remaining variables -- confirm).
MLB$Earned.Run.Avg <- max(MLB$Earned.Run.Avg) - MLB$Earned.Run.Avg
MLB$Errors <- max(MLB$Errors) - MLB$Errors

# ---- PCA on the correlation matrix ----
cor.MLB <- cor(MLB)
cor.MLB
# Decompose once and reuse the result instead of calling eigen() repeatedly.
MLB_cor.eigen <- eigen(cor.MLB)
round(MLB_cor.eigen$values, 3)
round(MLB_cor.eigen$vectors, 3)
# Eigen-decomposition of the correlation sub-matrix for variables 2-5, i.e.
# with the first variable left out.
eigen(cor.MLB[2:5, 2:5])

# Component variances, loadings and proportion of variance explained.
PC.variance <- MLB_cor.eigen$values
PC.variables <- MLB_cor.eigen$vectors
PC.var.prop <- PC.variance / sum(PC.variance)
PC.var.prop
# M is a lower-triangular matrix of ones, so M %*% PC.var.prop is the
# cumulative proportion of variance explained by the first k components.
# It is sized from the data rather than hard-coded as 5 x 5.
M <- 1 * lower.tri(diag(length(PC.var.prop)), diag = TRUE)
M
M %*% PC.var.prop
plot(
  seq_along(PC.variance), PC.variance,
  main = "Scree Plot for Baseball Data",
  xlab = "Principal Component Number",
  ylab = "Principal Component Variance",
  type = "b"
)